#ifndef _SIMD_CPP_HPP
#define _SIMD_CPP_HPP
/* Wrapper attributes for intrinsic-like helpers.
 * __wrapped_intrinsic__ adds `pure` (no side effects, may read memory);
 * __wrapped_mem_intrinsic__ omits `pure` for helpers that store to memory.
 * NOTE(review): identifiers containing "__" are reserved for the
 * implementation; consider renaming (verify no other TU uses them first). */
#define __wrapped_intrinsic__ __attribute__((always_inline, pure)) inline
#define __wrapped_mem_intrinsic__ __attribute__((always_inline)) inline
#ifdef __sw_256bit_simd__
/* 256-bit SIMD branch: vreal = 4 doubles; vint = one 256-bit integer
 * (mode V1OI). VEC_MASK == VEC_WIDTH - 1 for index wrapping. */
typedef int vint __attribute__ ((__mode__(__V1OI__)));
typedef double vreal __attribute__ ((__mode__(__V4DF__)));
#define VEC_MASK 3
#define VEC_WIDTH 4
#include "esmd_types.h"
/* Store all four double lanes of `val` to `dest` (vstd_f). */
static __wrapped_mem_intrinsic__ void v_std(real *dest, vreal val) {
  __builtin_sw_vstd_f(dest, val);
}
static __wrapped_mem_intrinsic__ vint v_stl(long *dest, vint a) {
  __builtin_sw_vstd_o(dest, a);
}
/* Load four contiguous doubles starting at `src`. */
static __wrapped_mem_intrinsic__ vreal v_ldd(real *src) {
  return (vreal)__builtin_sw_vldd_f(src);
}
/* Load via the sw `ldde` instruction — presumably a broadcast load of a
 * single double to all lanes (TODO confirm against the ISA manual). */
static __wrapped_mem_intrinsic__ vreal v_lde(real *src) {
  return (vreal)__builtin_sw_ldde(src);
}

/* Load four floats from `src`, presumably widening each lane to double
 * (vlds) — TODO confirm widening behavior against the intrinsic docs. */
static __wrapped_mem_intrinsic__ vreal v_lds(float *src) {
  return (vreal)__builtin_sw_vlds(src);
}
/* Load four contiguous 64-bit integers starting at `src` (vldd_o). */
static __wrapped_mem_intrinsic__ vint v_ldl(long *src) {
  return (vint)__builtin_sw_vldd_o(src);
}
/* Assemble a vector from four scalars: lane i <- fi (vinsfd inserts).
 * NOTE(review): `ret` is read uninitialized by the first vinsfd. All four
 * lanes are overwritten so the final value is well defined, but the
 * uninitialized read is technically UB — consider zero-initializing. */
static __wrapped_intrinsic__ vreal v_setd(real f0, real f1, real f2, real f3) {
  vreal ret;
  ret = __builtin_sw_vinsfd(f0, ret, 0);
  ret = __builtin_sw_vinsfd(f1, ret, 1);
  ret = __builtin_sw_vinsfd(f2, ret, 2);
  ret = __builtin_sw_vinsfd(f3, ret, 3);
  return ret;
}
/* Lane shuffle of a/b controlled by the immediate `mask` (vshff).
 * Presumably two selector bits per destination lane, shufpd-style —
 * TODO confirm the exact encoding against the SW ISA manual.
 * "ir" lets the compiler emit `mask` as an immediate when possible. */
static __wrapped_intrinsic__ vreal v_shff(vreal a, vreal b, int mask) {
  vreal ret;
  asm ("vshff %1, %2, %3, %0\n\t" : "=r"(ret) : "r"(a), "r"(b), "ir"(mask));
  return ret;
}
/* Extract lane 0 of `a` as a scalar (vextfd). */
static __wrapped_intrinsic__ real v_extf0(vreal a) {
  real lane0 = (real)__builtin_sw_vextfd(a, 0);
  return lane0;
}
/* Horizontal sum of all four lanes via two butterfly exchanges:
 * mask 0xb1 swaps adjacent lanes, 0x4e swaps the two halves (assuming
 * shufpd-style mask encoding — TODO confirm), after which every lane
 * holds the total; lane 0 is extracted. */
static __wrapped_intrinsic__ real v_sum(vreal a) {
  vreal t = a + v_shff(a, a, 0xb1);
  t = t + v_shff(t, t, 0x4e);
  return (real)v_extf0(t);
}

/* Lane-wise conditional selects; predicate evaluated on `a` per lane.
 * Presumed from the intrinsic names (TODO confirm):
 *   vselled: lane = (a <= 0) ? b : c
 *   vselltd: lane = (a <  0) ? b : c */
static __wrapped_intrinsic__ vreal v_selle(vreal a, vreal b, vreal c) {
  return (vreal)__builtin_sw_vselled(a, b, c);
}
static __wrapped_intrinsic__ vreal v_sellt(vreal a, vreal b, vreal c) {
  return (vreal)__builtin_sw_vselltd(a, b, c);
}
/* a > 0 is !(a <= 0): reuse vselled with the b/c operands swapped. */
static __wrapped_intrinsic__ vreal v_selgt(vreal a, vreal b, vreal c) {
  return (vreal)__builtin_sw_vselled(a, c, b);
}
/* Broadcast scalar `a` to all four lanes by shuffling it with itself.
 * NOTE(review): the scalar is passed through a plain "r" operand to a
 * vector instruction — this relies on scalar and vector registers sharing
 * a register file on this target; confirm against the SW ABI. */
static __wrapped_intrinsic__ vreal v_set1d(real a) {
  vreal t;
  asm("vshff %1, %1, 0, %0\n\t"
      : "=r"(t)
      : "r"(a));
  return t;
}
/* Integer broadcast. Note: takes `long long` while the 512-bit variant's
 * v_set1l takes `long` (same width here, but inconsistent). */
static __wrapped_intrinsic__ vint v_set1l(long long a) {
  vint t;
  asm("vshff %1, %1, 0, %0\n\t"
      : "=r"(t)
      : "r"(a));
  return t;
}
/* Presumably lane = (a == 0) ? b : c — confirm intrinsic semantics. */
static __wrapped_intrinsic__ vreal v_seleq(vreal a, vreal b, vreal c) {
  return (vreal)__builtin_sw_vseleqd(a, b, c);
}
/* Lane-wise FP compares; result is a per-lane flag vector. */
static __wrapped_intrinsic__ vreal v_fcmplt(vreal a, vreal b) {
  return (vreal)__builtin_sw_vfcmpltd(a, b);
}
/* a > b expressed as b < a. */
static __wrapped_intrinsic__ vreal v_fcmpgt(vreal a, vreal b) {
  return (vreal)__builtin_sw_vfcmpltd(b, a);
}
/* Operand order swapped vs. the 512-bit variant; harmless (== symmetric). */
static __wrapped_intrinsic__ vreal v_fcmpeq(vreal a, vreal b) {
  return (vreal)__builtin_sw_vfcmpeqd(b, a);
}
/* Lane-wise integer "equality" result: subtract, then select on zero.
 * NOTE(review): vint has mode V1OI (one 256-bit integer); if `a - b` is a
 * single wide subtract, borrows cross 64-bit lane boundaries and per-lane
 * equality can be misreported — verify the mode's arithmetic semantics.
 * NOTE(review): assuming vseleqd selects b where a == 0, equal lanes map
 * to 0.0 and unequal to 2.0 here, while the 512-bit variant uses 1.0 —
 * confirm which value callers depend on. */
static __wrapped_intrinsic__ vreal v_icmpeq(vint a, vint b) {
  union {
    vint l;
    vreal d;
  } tmp;
  tmp.l = a - b;  /* type-pun the integer difference to doubles for v_seleq */
  return v_seleq(tmp.d, v_set1d(0), v_set1d(2.0));
}
/* Lane-wise copy-sign (vcpysd): presumably sign from `a`, magnitude from
 * `b`, Alpha CPYS style — TODO confirm operand roles. */
static __wrapped_intrinsic__ vreal v_cpys(vreal a, vreal b) {
  vreal combined = __builtin_sw_vcpysd(a, b);
  return combined;
}
/* Bit-cast a vint to vreal through a union (no value conversion). */
static __wrapped_intrinsic__ vreal v_castif(vint i) {
  union {
    vint as_int;
    vreal as_real;
  } pun;
  pun.as_int = i;
  return pun.as_real;
}

#ifdef __cplusplus
static __wrapped_intrinsic__ void simd_load_xyzvec(vreal &v0, vreal &v1, vreal &v2, real *base) {
  vreal l2, t0;
  v0 = v_ldd(base);
  v1 = v_ldd(base + 4);
  l2 = v_ldd(base + 8);

  t0 = v_shff(l2, v1, 0x6b);
  v2 = v_shff(v1, v0, 0x46);
  v0 = v_shff(t0, v0, 0xdc);
  v1 = v_shff(t0, v2, 0x89);
  v2 = v_shff(l2, v2, 0xcc);
}
#else
static __wrapped_intrinsic__ void simd_load_xyzvec(vreal *v0, vreal *v1, vreal *v2, real *base) {
  vreal l2, t0;
  *v0 = v_ldd(base);
  *v1 = v_ldd(base + 4);
  l2 = v_ldd(base + 8);

  t0 = v_shff(l2, *v1, 0x6b);
  *v2 = v_shff(*v1, *v0, 0x46);
  *v0 = v_shff(t0, *v0, 0xdc);
  *v1 = v_shff(t0, *v2, 0x89);
  *v2 = v_shff(l2, *v2, 0xcc);
}
#endif
/* "Any lane set" style test using vmatch against the constant 0x40000000.
 * NOTE(review): the vmatch semantics and the meaning of the constant
 * operand are not visible here — confirm against the ISA before relying
 * on anything beyond zero/nonzero of the result. */
static __wrapped_intrinsic__ int v_any(vreal v) {
  int r;

  asm("vmatch %1, %2, %0\n\t"
      : "=r"(r)
      : "r"(v), "r"(0x40000000));
  return r;
}
/* Lane-wise square root (vsqrtd). */
static __wrapped_intrinsic__ vreal v_sqrt(vreal x) {
  vreal root = __builtin_sw_vsqrtd(x);
  return root;
}
/* Lane-wise division. */
static __wrapped_intrinsic__ vreal v_divd(vreal x, vreal y) {
  vreal quotient = x / y;
  return quotient;
}
// static __wrapped_intrinsic__ vreal v_invsqrt(vreal x) {
//   vreal y;
//   asm ("srlow %1, 1, %0\n\t"
//        "vshff %2, %2, 0, %2\n\t"
//        "vcpys $31, %0, %0\n\t"
//        "vsubl %2, %0, %0\n\t":"=&r"(y) : "r"(x), "r"(0x5fe6ec85e7de30daL));
//   vreal f1_5v4 = v_set1d(1.5), xhalf = v_set1d(0.5) * x;
//   y = y * (f1_5v4 - xhalf * y * y);
//   y = y * (f1_5v4 - xhalf * y * y);
//   y = y * (f1_5v4 - xhalf * y * y);
//   return y;
// }
/* Optimization barrier on one value: the empty asm with a "+r" operand
 * forces `ref` into a register and makes its value opaque to the
 * optimizer, blocking CSE/reassociation across this point (used by the
 * commented-out v_invsqrt draft below to control FMA ordering).
 * `__always_inline` presumably comes from esmd_types.h — confirm. */
template<typename T>
__always_inline T touch(T &ref){
  asm volatile("" : "+r"(ref));
  return ref;
}
/* Multiply-add helper: returns a * b + c (fusable into an FMA by the
 * backend when profitable). */
template<typename T>
T tfma(T a, T b, T c){
  T product = a * b;
  return product + c;
}
/* Constants for the fast inverse-square-root kernels.
 * 0x5fe6eb50c7aa19f9 is a published double-precision analogue of the
 * Quake "fast inverse square root" magic constant.
 * Placed in section ".ldm" — presumably the slave-core Local Data Memory
 * on this target (confirm); `weak` so multiple TUs including this header
 * link to a single definition. */
__attribute__((section(".ldm"), weak)) unsigned long MAGIC_NUM = 0x5fe6eb50c7aa19f9L;
__attribute__ ((section (".ldm"), weak)) double F1_5 = 1.5;
/* Scalar/vector views of the same 256 bits. */
union longv4u {
  long sval[4];
  vint vval;
};
/* The magic constant replicated into all four 64-bit lanes. */
__attribute__((section(".ldm"), weak)) longv4u MAGICV4 = {.sval = {0x5fe6eb50c7aa19f9L, 0x5fe6eb50c7aa19f9L, 0x5fe6eb50c7aa19f9L, 0x5fe6eb50c7aa19f9L}};
/* Fast approximate 1/sqrt(x) per lane: bit-trick initial guess
 * (magic - (bits >> 1)) followed by three Newton–Raphson refinement
 * steps, written entirely in inline asm to pin instruction scheduling.
 * The ldih/sll sequence builds the 1.5 constant (0x3ff8... << 32) in $29.
 * NOTE(review): the asm uses $29, $at and lane 1 of $30 as scratch
 * (saving/restoring $29 through $30) but declares no clobber list; if
 * the compiler keeps live values in $at/$29 across this statement they
 * will be corrupted — verify against the SW ABI or add clobbers. */
static __wrapped_intrinsic__ vreal v_invsqrt(vreal x) {
  vreal t1, y, f15, t2, t0; /* NOTE(review): t1, f15, t2, t0 are unused */
  long x2 = 0x5fe6eb50c7aa19f9L; /* fast-invsqrt magic constant */
  asm volatile("vinsf $29, $30, 1, $30\n\t"
               "ldih $29, 0x3ff8($31)\n\t"
               "srlow %[X], 1, %[Y]\n\t"
               "sll  $29, 32, $29\n\t"
               "vshff %[X2], %[X2], 0, %[X2]\n\t"
               "vshff $29, $29, 0, $29\n\t"
               "vcpys $31, %[Y], %[Y]\n\t"
               "vsubl %[X2], %[Y], %[Y]\n\t"
               "vmsd %[X], $29, %[X], %[X2]\n\t"
               "vmuld %[Y], $29, %[X]\n\t"
               "vmuld %[Y], %[Y], $at\n\t"
               "vmuld %[Y], %[X2], %[Y]\n\t"
               "vnmad %[Y], $at, %[X], %[Y]\n\t"
               "vmuld %[Y], $29, %[X]\n\t"
               "vmuld %[Y], %[Y], $at\n\t"
               "vmuld %[Y], %[X2], %[Y]\n\t"
               "vnmad %[Y], $at, %[X], %[Y]\n\t"
               "vmuld %[Y], $29, %[X]\n\t"
               "vmuld %[Y], %[Y], $at\n\t"
               "vmuld %[Y], %[X2], %[Y]\n\t"
               "vnmad %[Y], $at, %[X], %[Y]\n\t"
               "vextf $30, 1, $29\n\t"
               : [Y] "=&r"(y), [X2] "+&r"(x2), [X]"+&r"(x));
  return y;

  // vreal f1_5v;
  // asm("vshff %1, %1, 0, %0\n\t" :"=r"(f1_5v) : "r"(F1_5));
  // vreal x2 = x - x * f1_5v;
  // union {
  //   vint l;
  //   vreal d;
  // } tmp, magic;
  // magic.d = __builtin_sw_vcpyfd(*(double*)&MAGIC_NUM);
  // tmp.d = x;
  // tmp.l >>= 1;
  // tmp.d = __builtin_sw_vcpysd(f1_5v, tmp.d);

  // tmp.l = magic.l - tmp.l;
  

  // vreal y = tmp.d;
  // vreal y15;
  // y15 = f1_5v * y;
  // y = tfma(y*y, x2*y, touch(y15));
  // y15 = f1_5v * y;
  // y = tfma(y*y, x2*y, touch(y15));
  // y15 = f1_5v * y;
  // y = tfma(y*y, x2*y, touch(y15));

  // return y;

}
#endif
#ifdef __sw_512bit_simd__
/* 512-bit SIMD branch: vreal = 8 doubles; vint = one 512-bit integer
 * (mode V1XI); iv16 = 16 x 32-bit words, used for vshfw selectors.
 * NOTE(review): VEC_MASK/VEC_WIDTH and vint/vreal are also defined in the
 * 256-bit branch; the two ISA macros are presumably mutually exclusive —
 * confirm, otherwise these are conflicting redefinitions. */
#define VEC_MASK 7
#define VEC_WIDTH 8
#include "esmd_types.h"
typedef int vint __attribute__ ((__mode__(__V1XI__)));
typedef double vreal __attribute__ ((__mode__(__V8DF__)));
typedef int iv16 __attribute__ ((__mode__(__V16SI__)));
/* Store all eight double lanes of `val` to `dest` (slave vstd_f). */
static __wrapped_mem_intrinsic__ void v_std(real *dest, vreal val) {
  __builtin_sw_slave_vstd_f(dest, val);
}
/* Load eight contiguous doubles starting at `src`. */
static __wrapped_mem_intrinsic__ vreal v_ldd(real *src) {
  return (vreal)__builtin_sw_slave_vldd_f(src);
}
/* Load eight contiguous 64-bit integers starting at `src` (vldd_x). */
static __wrapped_mem_intrinsic__ vint v_ldl(long *src) {
  return (vint)__builtin_sw_slave_vldd_x(src);
}
/* Broadcast scalar `a` to all eight double lanes via vcpyf.
 * (The builtin alternative is kept commented for reference.) */
static __wrapped_intrinsic__ vreal v_set1d(real a) {
  // return __builtin_sw_slave_vcpyfd(a);
  vreal ret;
  asm ("vcpyf %1, %0\n\t" : "=r"(ret) : "r"(a));
  return ret;
}
/* Broadcast a 64-bit integer to all lanes. NOTE(review): uses the FP
 * broadcast vcpyf on an integer operand — confirm this is intended. */
static __wrapped_intrinsic__ vint v_set1l(long a) {
  // return __builtin_sw_slave_vcpyfl(a);
  vint ret;
  asm ("vcpyf %1, %0\n\t" : "=r"(ret) : "r"(a));
  return ret;
}
/* Lane shuffle of a/b controlled by `mask` (vshff); "ir" lets the
 * compiler emit `mask` as an immediate when it is a constant.
 * TODO confirm the selector encoding against the SW ISA manual. */
static __wrapped_intrinsic__ vreal v_shff(vreal a, vreal b, int mask) {
  vreal ret;
  asm ("vshff %1, %2, %3, %0\n\t" : "=r"(ret) : "r"(a), "r"(b), "ir"(mask));
  return ret;
}
/* Extract lane 0 of `a` as a scalar (slave vextfd). */
static __wrapped_intrinsic__ real v_extf0(vreal a) {
  real lane0 = (real)__builtin_sw_slave_vextfd(a, 0);
  return lane0;
}

/* Lane-wise conditional selects; predicate evaluated on `a` per lane.
 * Presumed from the intrinsic names (TODO confirm):
 *   vselled: lane = (a <= 0) ? b : c
 *   vselltd: lane = (a <  0) ? b : c */
static __wrapped_intrinsic__ vreal v_selle(vreal a, vreal b, vreal c) {
  return (vreal)__builtin_sw_slave_vselled(a, b, c);
}
static __wrapped_intrinsic__ vreal v_sellt(vreal a, vreal b, vreal c) {
  return (vreal)__builtin_sw_slave_vselltd(a, b, c);
}
/* a > 0 is !(a <= 0): reuse vselled with the b/c operands swapped. */
static __wrapped_intrinsic__ vreal v_selgt(vreal a, vreal b, vreal c) {
  return (vreal)__builtin_sw_slave_vselled(a, c, b);
}
/* Presumably lane = (a == 0) ? b : c — confirm intrinsic semantics. */
static __wrapped_intrinsic__ vreal v_seleq(vreal a, vreal b, vreal c) {
  return (vreal)__builtin_sw_slave_vseleqd(a, b, c);
}
/* Lane-wise FP compares; result is a per-lane flag vector. */
static __wrapped_intrinsic__ vreal v_fcmplt(vreal a, vreal b) {
  return (vreal)__builtin_sw_slave_vfcmpltd(a, b);
}
static __wrapped_intrinsic__ vreal v_fcmpeq(vreal a, vreal b) {
  return (vreal)__builtin_sw_slave_vfcmpeqd(a, b);
}
/* a > b expressed as b < a. */
static __wrapped_intrinsic__ vreal v_fcmpgt(vreal a, vreal b) {
  return (vreal)__builtin_sw_slave_vfcmpltd(b, a);
}
/* Horizontal sum of all eight lanes via the hardware reduction. */
static __wrapped_intrinsic__ real v_sum(vreal a) {
  real total = (real)__builtin_sw_slave_reduc_plusd(a);
  return total;
}
/* Lane-wise integer "equality" result: subtract, then select on zero.
 * NOTE(review): vint has mode V1XI (one 512-bit integer); if `a - b` is a
 * single wide subtract, borrows cross 64-bit lane boundaries and per-lane
 * equality can be misreported — verify the mode's arithmetic semantics.
 * NOTE(review): unequal lanes map to 1.0 here, but the 256-bit variant
 * uses 2.0 — confirm which value callers depend on. */
static __wrapped_intrinsic__ vreal v_icmpeq(vint a, vint b) {
  union {
    vint l;
    vreal d;
  } tmp;
  tmp.l = a - b;
  return v_seleq(tmp.d, v_set1d(0), v_set1d(1.0));
}
/* Lane-wise copy-sign (vcpysd): presumably sign from `a`, magnitude from
 * `b`, Alpha CPYS style — TODO confirm operand roles. */
static __wrapped_intrinsic__ vreal v_cpys(vreal a, vreal b) {
  vreal combined = __builtin_sw_slave_vcpysd(a, b);
  return combined;
}
#ifdef __cplusplus
static __wrapped_intrinsic__ iv16 v_set_maskw(int a0, int a1, int a2, int a3, int a4, int a5, int a6, int a7, int a8, int a9, int a10, int a11, int a12, int a13, int a14, int a15){
  iv16 mask;
  int lo = (a0 &31)<< 0 | (a1 &31)<< 5  | (a2 &31)<<10 | (a3 &31)<<15 | (a4 &31)<<20 | (a5 &31)<<25 | (a6 &31)<<30;
  int md = (a6 &31)>> 2 | (a7 &31)<< 3  | (a8 &31)<< 8 | (a9 &31)<<13 | (a10&31)<<18 | (a11&31)<<23 | (a12&31)<<28;
  int hi = (a12&31)>> 4 | (a13&31)<< 1  | (a14&31)<< 6 | (a15&31)<<11;
  mask = __builtin_sw_slave_vinsw(lo, mask, 0);
  mask = __builtin_sw_slave_vinsw(md, mask, 1);
  mask = __builtin_sw_slave_vinsw(hi, mask, 2);
  return mask;
}
static __wrapped_intrinsic__ iv16 v_set_maskd(int a0, int a1, int a2, int a3, int a4, int a5, int a6, int a7){
  return v_set_maskw(a0*2,a0*2+1,a1*2,a1*2+1,a2*2,a2*2+1,a3*2,a3*2+1,a4*2,a4*2+1,a5*2,a5*2+1,a6*2,a6*2+1,a7*2,a7*2+1);
}
static __wrapped_intrinsic__ void simd_load_xyzvec(vreal &v0, vreal &v1, vreal &v2, real *base) {
  iv16
    xmasklo = v_set_maskd(0, 3, 6, 9, 12, 15, 0, 0),
    xmaskhi = v_set_maskd(0, 1, 2, 3, 4, 5, 10, 13),
    ymasklo = v_set_maskd(1, 4, 7, 10, 13, 0, 0, 0),
    ymaskhi = v_set_maskd(0, 1, 2, 3, 4, 8, 11, 14),
    zmasklo = v_set_maskd(2, 5, 8, 11, 14, 0, 0, 0),
    zmaskhi = v_set_maskd(0, 1, 2, 3, 4, 9, 12, 15);

  union {
    vreal d;
    vint l;
    iv16 i;
  } v0u, v1u, v2u, xtu, ytu, ztu, t0u, t1u, t2u;
  
  t0u.d = v_ldd(base);
  t1u.d = v_ldd(base + 8);
  t2u.d = v_ldd(base + 16);

  asm("vshfw %1, %2, %3, %0\n\t" : "=r"(xtu.d): "r"(t0u.d), "r"(t1u.d), "r"(xmasklo));
  asm("vshfw %1, %2, %3, %0\n\t" : "=r"(v0u.d): "r"(xtu.d), "r"(t2u.d), "r"(xmaskhi));
  asm("vshfw %1, %2, %3, %0\n\t" : "=r"(ytu.d): "r"(t0u.d), "r"(t1u.d), "r"(ymasklo));
  asm("vshfw %1, %2, %3, %0\n\t" : "=r"(v1u.d): "r"(ytu.d), "r"(t2u.d), "r"(ymaskhi));
  asm("vshfw %1, %2, %3, %0\n\t" : "=r"(ztu.d): "r"(t0u.d), "r"(t1u.d), "r"(zmasklo));
  asm("vshfw %1, %2, %3, %0\n\t" : "=r"(v2u.d): "r"(ztu.d), "r"(t2u.d), "r"(zmaskhi));
  v0 = v0u.d;
  v1 = v1u.d;
  v2 = v2u.d;
}

#else
static __wrapped_mem_intrinsic__ void simd_load_xyzvec(vreal *v0, vreal *v1, vreal *v2, real *base) {
  iv16
    xmasklo = v_set_maskd(0, 3, 6, 9, 12, 15, 0, 0),
    xmaskhi = v_set_maskd(0, 1, 2, 3, 4, 5, 10, 13),
    ymasklo = v_set_maskd(1, 4, 7, 10, 13, 0, 0, 0),
    ymaskhi = v_set_maskd(0, 1, 2, 3, 4, 8, 11, 14),
    zmasklo = v_set_maskd(2, 5, 8, 11, 14, 0, 0, 0),
    zmaskhi = v_set_maskd(0, 1, 2, 3, 4, 9, 12, 15);

  union {
    vreal d;
    vint l;
    iv16 i;
  } v0u, v1u, v2u, xtu, ytu, ztu, t0u, t1u, t2u;
  
  t0u.d = v_ldd(base);
  t1u.d = v_ldd(base + 8);
  t2u.d = v_ldd(base + 16);

  asm("vshfw %1, %2, %3, %0\n\t" : "=r"(xtu.d): "r"(t0u.d), "r"(t1u.d), "r"(xmasklo));
  asm("vshfw %1, %2, %3, %0\n\t" : "=r"(v0u.d): "r"(xtu.d), "r"(t2u.d), "r"(xmaskhi));
  asm("vshfw %1, %2, %3, %0\n\t" : "=r"(ytu.d): "r"(t0u.d), "r"(t1u.d), "r"(ymasklo));
  asm("vshfw %1, %2, %3, %0\n\t" : "=r"(v1u.d): "r"(ytu.d), "r"(t2u.d), "r"(ymaskhi));
  asm("vshfw %1, %2, %3, %0\n\t" : "=r"(ztu.d): "r"(t0u.d), "r"(t1u.d), "r"(zmasklo));
  asm("vshfw %1, %2, %3, %0\n\t" : "=r"(v2u.d): "r"(ztu.d), "r"(t2u.d), "r"(zmaskhi));
  *v0 = v0u.d;
  *v1 = v1u.d;
  *v2 = v2u.d;
}
#endif
/* Nonzero if any lane of `v` is nonzero: ucmpeqx compares the whole
 * vector against zero ($31); a wholly-zero vector yields a nonzero `ret`,
 * hence the negation. NOTE(review): confirm the ucmpeqx result
 * convention against the ISA manual. */
static __wrapped_intrinsic__ int v_any(vreal v) {
  int ret;
  asm("ucmpeqx %1, $31, %0\n\t":"=r"(ret) : "r"(v));
  return !ret;
}

/* Lane-wise square root (slave vsqrtd). */
static __wrapped_intrinsic__ vreal v_sqrt(vreal x) {
  vreal root = __builtin_sw_slave_vsqrtd(x);
  return root;
}
/* Lane-wise division. */
static __wrapped_intrinsic__ vreal v_divd(vreal x, vreal y) {
  vreal quotient = x / y;
  return quotient;
}
/* Exact per-lane 1/sqrt(x): unlike the 256-bit branch, this uses a full
 * divide and hardware sqrt rather than a Newton–Raphson approximation. */
static __wrapped_intrinsic__ vreal v_invsqrt(vreal x) {
  vreal ones = __builtin_sw_slave_vcpyfd(1.0);
  return ones / __builtin_sw_slave_vsqrtd(x);
}
/* Debug helper: print the eight 64-bit words of `v` in hex, space
 * separated. Assumes <stdio.h> is included by the user of this header. */
static void vint_print(vint v) {
  union {
    vint vec;
    long words[8];
  } u;
  u.vec = v;
  printf("%lx %lx %lx %lx %lx %lx %lx %lx\n",
         u.words[0], u.words[1], u.words[2], u.words[3],
         u.words[4], u.words[5], u.words[6], u.words[7]);
}
#endif
#endif
